## ************************************************************************** ## 
## Project: Gov 2020 Project
## Author: Amelia Malpas, Leah Stein & David Struhs
## Date Created: September 27, 2024
## Date Edited: November 27, 2024
## Task: Construct main data set for analysis 
## ************************************************************************** ## 

## -------------------------------------------------------------------------- ## 
## Input: County data, Commuting Zones, Media Markets
##        CHIPS Location Data & 2016-2020 Presidential Election Data
## Output:  "chips_counties_with_pres_vote.RData" &
##          "final_data.Rdata"
## -------------------------------------------------------------------------- ## 

## -------------------------------------------------------------------------- ## 
## Set up: load libraries, define paths 
## -------------------------------------------------------------------------- ## 



library(tidyverse)
library(janitor)
library(stringi)
library(readxl)
library(writexl)


##load in updated CHIPS data (with county names)
chips_updated <- read_excel("data/data-AXG8O_updated.xlsx") %>% 
  clean_names() %>% 
  mutate(county = tolower(county)) #%>% 
#mutate(project_size_num_individual = as.numeric(format(project_size_num_individual, scientific = FALSE)))

##load in commuting zone data https://sites.psu.edu/psucz/data/ 
commuting_zone_20 <- read_csv("data/commuting zone 20.csv") %>% 
  clean_names() %>% 
  rename(county_fips = geoid)

##and media market data https://public.tableau.com/app/profile/amirgnyawali/viz/TVDMAmap/Sheet1
media_markets <- read_excel("data/media_markets.xlsx") %>% 
  clean_names() %>% 
  mutate(county = tolower(county), state = tolower(state)) %>%
  mutate(county = str_trim(county, side = "both"), state = str_trim(state, side = "both"))

media_markets <- media_markets[, c(1:3)]

## -------------------------------------------------------------------------- ## 
## Data Cleaning and Matching
## -------------------------------------------------------------------------- ## 


#data to match with county fips codes
geocodes <- read_excel("data/all-geocodes-v2020.xlsx") %>% 
  clean_names() %>% 
  rename(county = area_name_including_legal_statistical_area_description) %>% 
  mutate(county_fips = paste0(state_code_fips, county_code_fips, sep = "")) %>% 
  select(county, county_fips) %>% 
  distinct(county_fips, .keep_all = T) %>% 
  filter(county != "United States") %>% 
  mutate(state = ifelse(endsWith(county_fips, '000') == "TRUE", county, NA)) %>% 
  fill(state, .direction = "down") %>% 
  filter(county != state) %>% 
  mutate(county = tolower(county), state = tolower(state)) %>% 
  mutate(county = gsub("county", "", county), county = gsub("parish", "", county), county = gsub("census area", "", county), 
         county = gsub("municipality", "", county), county = gsub("borough", "", county), county = gsub("city", "", county), ) %>% 
  mutate(county = str_trim(county, side = "both"), state = str_trim(state, side = "both")) %>% 
  mutate(county = gsub('[[:punct:] ]+', ' ', county)) %>% 
  mutate(county = stringi::stri_trans_general(county, "Latin-ASCII")) %>% 
  filter(state != "puerto rico")

##final county cleaning
c_matching <- read_excel("data/c_matching.xlsx")[, c(1:3)] %>% 
  mutate(county_new = gsub("de", "de ", county)) %>%
  mutate(county_new = gsub("du", "du ", county_new)) %>% 
  mutate(county_new = gsub("la", "la ", county_new)) %>% 
  mutate(county_new = ifelse(county_new == "hills", "hillsborough", county_new)) %>%
  add_column(x = "x")


geocodes <- geocodes %>% 
  left_join(., c_matching) %>%
  mutate(county = ifelse(!is.na(x), county_new, county)) 

geocodes <- geocodes[, c(1:3)]


##Merge counties data together 
counties <- geocodes %>% 
  left_join(., media_markets) %>% 
  left_join(., commuting_zone_20) %>% 
  rename(media_market = tvdma, commuting_zone = cz20)

counties <- counties %>% 
  mutate(commuting_zone = ifelse(county_fips == "09001", 388, commuting_zone)) %>% 
  mutate(commuting_zone = ifelse(is.na(commuting_zone), 88, commuting_zone)) ##mismatch between Connecticut counties and new administrative zones


##consolidate CHIPS data to county level 

chips_distinct <- chips_updated %>% 
  group_by(county_fips) %>% 
  summarize(chips_county_project = sum(project_size_num_individual, na.rm = T)) %>% 
  add_column(chips_county = 1)

##Merge with counties data
chips_counties <- counties %>% 
  left_join(., chips_distinct) %>% 
  mutate(chips_county = ifelse(is.na(chips_county), 0, chips_county), 
         chips_county_project = ifelse(is.na(chips_county_project), 0, chips_county_project)) 

##Media markets and commuting zones with CHIPS investment 
chip <- chips_counties %>% 
  filter(chips_county == 1)

chip_mm <- chip$media_market

chip_cz <- chip$commuting_zone

chips_counties <- chips_counties %>% 
  mutate(chips_mm = ifelse(media_market %in% chip_mm, 1, 0)) %>% 
  mutate(chips_cz = ifelse(commuting_zone %in% chip_cz, 1, 0))



##One strategy we discussed: same media market but different commuting zone
chips_counties_mm <- chips_counties %>% 
  filter(chips_mm == 1)

## -------------------------------------------------------------------------- ## 
## Save CHIPS County Data
## -------------------------------------------------------------------------- ## 

save(chips_counties, chips_counties_mm, file = "data/chips_counties.rdata")


## -------------------------------------------------------------------------- ## 
## Load Presidential Election Data 2016-2020 and Merge onto CHIPS Data
## -------------------------------------------------------------------------- ## 


#Load Presidential vote by county 2016 and 2020 (Source: MIT Elections Database) 
data <- read.csv("data/countypres_2000-2020.csv", colClasses = c(county_fips = "character")) %>%
  filter(year == 2016 | year == 2020) %>%
  filter(party == "DEMOCRAT" | party == "REPUBLICAN")


# Create total vote counts for each year-state-county combination
county_total_check <- data %>%
  group_by(year, state, county_name, party) %>%
  summarize(has_total = any(mode == "TOTAL"), .groups = 'drop')

#Calculate sum of 'candidatevotes' for counties that lack a 'TOTAL' mode
new_total_rows <- data %>%
  filter(mode != "TOTAL") %>%
  group_by(year, state, county_name, county_fips, party, totalvotes) %>%
  summarize(candidatevotes = sum(candidatevotes, na.rm = TRUE), .groups = 'drop') %>%
  left_join(county_total_check, by = c("year", "state", "county_name", "party")) %>%
  filter(!has_total) %>%
  mutate(mode = "TOTAL")

#Bind the new 'TOTAL' rows with the original data
final_data <- bind_rows(data, new_total_rows) %>%
  arrange(year, state, county_name, party) %>% 
  filter(mode == "TOTAL")


#Clean and prepare data for merge (2016-2020)
vote <- final_data %>%
  mutate(
    county_fips = str_pad(county_fips, width = 5, pad = "0"), # Add leading zero if needed
    vote_percent = round((candidatevotes / totalvotes) * 100, 2)
  ) %>%
  group_by(year, state, county_name, county_fips, party) %>%
  summarise(vote_percent = mean(vote_percent, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(
    names_from = c(party, year),
    values_from = vote_percent,
    names_sep = "_"
  ) %>%
  rename(
    dem_vote_2016 = DEMOCRAT_2016,
    rep_vote_2016 = REPUBLICAN_2016,
    dem_vote_2020 = DEMOCRAT_2020,
    rep_vote_2020 = REPUBLICAN_2020,
    county = county_name)

## Prep for merge
load("data/chips_counties.rdata")

vote <- vote %>%
  mutate(
    county = tolower(county),
    county = str_replace_all(county, "[[:punct:]]", ""),  # Remove punctuation
    county = str_replace_all(county, " ", ""),  # Remove spaces
    county = str_trim(county),  # Trim leading and trailing spaces
    county = gsub("county|parish|borough|censusarea|municipality", "", county),  # Remove suffixes
    state = tolower(state)
  )

vote <- vote %>%
  mutate(county = ifelse(county_fips == "29510" & county == "stlouiscity", "stlouis", county)) %>%
  filter(county_fips != "2938000")

# Further clean and standardize 'county' and 'state' columns in chips_counties
chips_counties <- chips_counties %>%
  mutate(
    county = tolower(county),
    county = str_replace_all(county, "[[:punct:]]", ""),  # Remove punctuation
    county = str_replace_all(county, " ", ""),  # Remove spaces
    county = str_trim(county),  # Trim leading and trailing spaces
    county = gsub("county|parish|borough|censusarea|municipality", "", county),  # Remove suffixes
    state = tolower(state)
  )

merged_data <- vote %>%
  left_join(chips_counties, by = c("state", "county_fips", "county"))

write.csv(merged_data, "data/chips_counties_with_pres_vote.csv")
save(merged_data, file = "data/chips_counties_with_pres_vote.RData")


## -------------------------------------------------------------------------- ##
## Add 2024 Presidential Election Data (Source: Noah Dasanaike)
## -------------------------------------------------------------------------- ##

df <- read.csv("data/results_2024.csv", colClasses = c(county_fips = "character"))

## -------------------------------------------------------------------------- ##
## Create DEM vote share and filter to CHIPS counties
## -------------------------------------------------------------------------- ##

df$pct <- (df$candidatevotes/df$totalvotes)*100

df <- df %>% filter(party == "DEMOCRAT" | party == "REPUBLICAN") %>%
  select(county_fips, party, pct, year)

df <- df %>%
  mutate(county_fips = str_pad(county_fips, width = 5, pad = "0"))

df_long <- df %>% pivot_wider(
  names_from = party, 
  values_from = pct, 
  names_prefix = "vote_2024_") %>%
  rename(dem_vote_2024 = vote_2024_DEMOCRAT, rept_vote_2024 = vote_2024_REPUBLICAN) %>%
  select(-year)

## -------------------------------------------------------------------------- ##
## Save File
## -------------------------------------------------------------------------- ##


merged_result <- left_join(merged_data, df_long, by = "county_fips")

write.csv(merged_result, "data/final_data.csv")
save(merged_result, file = "data/final_data.Rdata")

